Facial morphology is influenced by both genetic factors and environmental factors. correction of confounders, such as age and BMI, is to remove the variation of facial structure from confounders and to ensures the association signal from genetic efforts on face. Partial least squares regression (PLSR) was used to correct facial morphology for the confounders of sex, age, BMI and facial size.
We use the RV coefficient to calculate the three-dimensional correlation between each pair of vertex and construct a square affinity matrix that stores the weights between every vertex-pair. Smaller inter-vertex differences are getting higher weights. Then, the matrix is transformed into a graph Laplacian matrix, which in turn is decomposed into its k selected eigenvectors. The eigenvectors serve as new coordinates for the vertices. This operation enhances the (dis)similarities between the vertices and emphasizes the distinction between the clusters by placing the vertices into a more separable space. This relaxes the partitioning problem. In the eigenvector-space, the Euclidean distance serves as a proxy for the affinity and drives a k-means clustering to partition the vertices into k clusters.
Generalized Procrustes Analysis (GPA) was used to eliminate differences in position and orientation of each segment. Then, principal component analysis with parallel analysis was used to determine the number of significant components contributing to facial shape.
import netCDF4
import numpy as np
import pandas as pd
import pickle
import random
from scipy import stats
from scipy.spatial import procrustes
import sklearn.decomposition as decomp
import matplotlib.pyplot as plt
import sklearn.cluster as clt
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
import torch
torch.cuda.set_device(2)
torch.cuda.current_device()
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
from plotface import obj_to_mesh,plot3d,obj_data
vtx = netCDF4.Dataset('../../GWAS/data/taizhou2014_face.nc4','r')
vtx = vtx.variables['var1_1'][:].data
vtx.shape
vtx = vtx.reshape((2659,32251*3))
meta = pd.read_csv('../../GWAS/data/meta_scale.csv',index_col=0)
pls2 = PLSRegression(n_components=3)
pls2.fit(meta, vtx)
Y_pred = pls2.predict(meta)
vtx_new = vtx-Y_pred
out_file = netCDF4.Dataset('../../GWAS/data/taizhou014_face_rmcof.nc','w',format='NETCDF4')
out_file.createDimension('x',2659)
out_file.createDimension('y',32251*3)
vars_list = out_file.createVariable('var','float32',('x','y'))
vars_list[:] = vtx_new
vtx_new = netCDF4.Dataset('../../GWAS/data/taizhou014_face_rmcof.nc')
vtx_new = vtx_new.variables['var'][:].data
vtx_new.shape
vtx_new = vtx_new.reshape((2659,32251,3))
def rv_gpu(a,b):
aa = torch.matmul(a,a.transpose(2,1))
bb = torch.matmul(b,b.transpose(2,1))
atr = aa.pow(2).sum(1).sum(1).sqrt()
btr = bb.pow(2).sum(1).sum(1).sqrt()
abtr = torch.matmul(aa.view(aa.shape[0],-1),bb.view(bb.shape[0],-1).transpose(1,0))
return abtr/atr[:,None]/btr[None,:]
def vtx_rv(arr,step=500):
n = arr.shape[0]
rv_lst = []
for i in range(0,n,step):
rv = torch.cat([rv_gpu(arr[i:min(i+step,n),:,:],arr[j:min(j+step,n),:,:]) for j in range(0,n,step)],1)
rv_lst.append(rv.cpu())
return torch.cat(rv_lst,0)
vtx_t = vtx_new.transpose(1,0,2)
vtx_t=torch.from_numpy(vtx_t).cuda()
vtx_t.shape
%time frv = vtx_rv(vtx_t,step=40)
frv = frv.cpu().numpy()
np.save('../../GWAS/data/taizhou2014_all_rmcof_rvcoef.npy',frv)
frv = np.load('../../GWAS/data/taizhou2014_all_rmcof_rvcoef.npy')
plt.hist(frv.ravel())
face = np.loadtxt('../../GWAS/data/E0001_g.vtx')
Xn = frv.copy()
Xn[Xn>1]=1.0
Dsqrt = np.sqrt(Xn.sum(1))
Xn = Xn / Dsqrt[:,None] /Dsqrt[None,:]
pca1 = PCA(n_components=50)
pca1.fit(Xn)
plt.plot(pca1.explained_variance_ratio_,'o-')
plt.plot(pca1.singular_values_,'o-')
Vn = pca1.components_.T
Vn = Vn - Vn.mean(axis=0)
Vn = Vn/Vn.ptp(axis=0)
SSE = []
scl_in = []
silhouette_avg = []
plt.figure(figsize=(18, 4), dpi= 80, facecolor='w', edgecolor='k')
for k in range(2,18):
scl1 = clt.KMeans(n_clusters=k,random_state=0).fit(Vn[:,:11])
silhouette_avg.append(silhouette_score(Vn[:,:11],scl1.labels_))
scl_in.append(scl1.labels_)
SSE.append(scl1.inertia_)
plt.subplot(2,8,k-1)
plt.scatter(face[:,0],face[:,1],c=scl1.labels_)
plt.show()
plt.plot(range(2,18),SSE,'o-')
plt.plot(range(2,18),silhouette_avg,'o-')
scl_in = np.load('../impute2_examples/data/phenotype/scl_in_kmean.npy')
vertices, faces = obj_to_mesh(obj_data)
plot3d(vertices,scl_in[8],faces)
plot3d(vertices,scl_in[8]==9,faces)
plt.scatter(face[:,0],face[:,1],c=scl_in[8]==9)
pd.Series(scl_in[8]).value_counts()
for i in range(9):
seg=vtx_new[:,scl_in[8]==i,:]
a = seg[1733]
prc = [procrustes(a,b) for b in seg]
seg_prc = [p[1] for p in prc]
seg_prc = np.stack(seg_prc)
print(seg_prc.shape)
seg_prc = seg_prc.reshape((2659,-1))
np.savetxt("../impute2_examples/data/phenotype/seg"+str(i)+"_prc.csv",seg_prc,delimiter=',')